#setwd("~/Dropbox/Shrew_Bat_Diet_Trial/Scripts/Analyses")
library(phyloseq)
library(ggplot2)
library(dplyr)
library(vegan)
#library(decontam)
library(stringr)
library(knitr)
library(hilldiv)
library(kableExtra)
library(gridExtra)
library(reshape2)
Prior to this script, sequences have been clustered together into MOTUs using sumaclust at similarity thresholds between 95% and 98%. Singletons have already been removed and MOTUs have already been assigned to taxonomy using blastn and NCBI database (see script XXX). The following batch of script were used to process/filter the MOTUs and clarify how many MOTUs are retained after filtering (i.e. removing low abundance samples and MOTUs and removing non=prey taxa such as vertebrate or parasitic taxa).
The code is shown for Gillet at 95% and Zeale at 95% clustering. The same code is repeated for thresholds between 96% - 98%. The full code is available on request, however it is just repeated chunks.
# Load Dataset
load("../gillet_dataset/sumaclust95/phyloseq_object_clust95_iden_taxa_assigned_no_singletons.RData")
# Change MOTU names to gMOTU_1, gMOTU_2 etc etc
taxa_names(gillet.phylo95) <- paste("gMOTU", seq(nrow(tax_table(gillet.phylo95))), sep="_")
# quickly remove very low read samples
gillet.phylo <- prune_samples(sample_sums(gillet.phylo95) > 50, gillet.phylo95)
# examine what we're looking at
gillet.phylo
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2170 taxa and 50 samples ]
## sample_data() Sample Data: [ 50 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 2170 taxa by 10 taxonomic ranks ]
# Remove 'sample.' part of sample names that obitools annoyingly adds
chck <- sample_names(gillet.phylo)
chck <- str_remove(chck, "sample.")
sample_names(gillet.phylo) <- chck
Whats in the blanks?
n <- which(otu_table(gillet.phylo)[,"G_NEG"] > 0)
#otu_table(gillet.phylo)[n,"G_NEG"]
#tax_table(gillet.phylo)[n,3:7]
m <- which(otu_table(gillet.phylo)[,"G_Neg"] > 0)
#otu_table(gillet.phylo)[m,"G_Neg"]
#tax_table(gillet.phylo)[m,3:7]
l <- unique(c(n,m))
blnk.df <- as.data.frame(as.matrix(tax_table(gillet.phylo))[l,4:7])
blnk.df$total.reads <- taxa_sums(gillet.phylo)[l]
blnk.df$Neg1.reads <- otu_table(gillet.phylo)[l, "G_NEG"]
for (i in 1:nrow(blnk.df)) {blnk.df$Neg1.prop[i] <- (blnk.df$Neg1.reads[i] / blnk.df$total.reads[i]) * 100 }
blnk.df$Neg2.reads <- otu_table(gillet.phylo)[l, "G_Neg"]
for (i in 1:nrow(blnk.df)) {blnk.df$Neg2.prop[i] <- (blnk.df$Neg2.reads[i] / blnk.df$total.reads[i]) * 100 }
blnk.df$perc <- apply(blnk.df[,c("Neg1.prop","Neg2.prop")], 1, sum)
rownames(blnk.df) <- 1:nrow(blnk.df)
#kable(blnk.df, caption = "MOTUs identified in the blanks")
landscape(knitr::kable(blnk.df, caption = "MOTUs identified in the blanks") %>%
kable_styling(bootstrap_options = "striped",
full_width = F,
position = "left"))
| order | family | genus | species | total.reads | Neg1.reads | Neg1.prop | Neg2.reads | Neg2.prop | perc |
|---|---|---|---|---|---|---|---|---|---|
| Chiroptera | Rhinolophidae | Family_Rhinolophidae | Family_Rhinolophidae | 3975502 | 3241 | 0.0815243 | 1862 | 0.0468369 | 0.1283611 |
| Primates | Hominidae | Homo | Homo_sapiens | 955 | 11 | 1.1518325 | 11 | 1.1518325 | 2.3036649 |
| Class_Mammalia | Class_Mammalia | Class_Mammalia | Class_Mammalia | 2 | 1 | 50.0000000 | 0 | 0.0000000 | 50.0000000 |
| Eulipotyphla | Soricidae | Family_Soricidae | Family_Soricidae | 1941340 | 0 | 0.0000000 | 804 | 0.0414147 | 0.0414147 |
| Eulipotyphla | Soricidae | Crocidura | Crocidura_russula | 426731 | 0 | 0.0000000 | 2180 | 0.5108605 | 0.5108605 |
| Hymenoptera | Cynipidae | Neuroterus | Neuroterus_quercusbaccarum | 17794 | 0 | 0.0000000 | 5 | 0.0280994 | 0.0280994 |
| Diptera | Order_Diptera | Order_Diptera | Order_Diptera | 6552 | 0 | 0.0000000 | 3 | 0.0457875 | 0.0457875 |
| Diptera | Agromyzidae | Family_Agromyzidae | Family_Agromyzidae | 6202 | 0 | 0.0000000 | 2 | 0.0322477 | 0.0322477 |
| Diptera | Order_Diptera | Order_Diptera | Order_Diptera | 2231 | 0 | 0.0000000 | 2 | 0.0896459 | 0.0896459 |
| Eulipotyphla | Soricidae | Crocidura | Genus_Crocidura | 1530 | 0 | 0.0000000 | 1 | 0.0653595 | 0.0653595 |
| Primates | Hominidae | Homo | Homo_sapiens | 820 | 0 | 0.0000000 | 35 | 4.2682927 | 4.2682927 |
| Eulipotyphla | Soricidae | Family_Soricidae | Family_Soricidae | 1678 | 0 | 0.0000000 | 1 | 0.0595948 | 0.0595948 |
| Araneae | Linyphiidae | Hypomma | Hypomma_cornutum | 563 | 0 | 0.0000000 | 1 | 0.1776199 | 0.1776199 |
| Class_Mammalia | Class_Mammalia | Class_Mammalia | Class_Mammalia | 388 | 0 | 0.0000000 | 1 | 0.2577320 | 0.2577320 |
| Eulipotyphla | Order_Eulipotyphla | Order_Eulipotyphla | Order_Eulipotyphla | 208 | 0 | 0.0000000 | 1 | 0.4807692 | 0.4807692 |
| Diptera | Scathophagidae | Family_Scathophagidae | Family_Scathophagidae | 63 | 0 | 0.0000000 | 1 | 1.5873016 | 1.5873016 |
| Hymenoptera | Order_Hymenoptera | Order_Hymenoptera | Order_Hymenoptera | 49 | 0 | 0.0000000 | 1 | 2.0408163 | 2.0408163 |
| Eulipotyphla | Soricidae | Crocidura | Genus_Crocidura | 155 | 0 | 0.0000000 | 2 | 1.2903226 | 1.2903226 |
| Chiroptera | Order_Chiroptera | Order_Chiroptera | Order_Chiroptera | 84 | 0 | 0.0000000 | 1 | 1.1904762 | 1.1904762 |
| Class_Mammalia | Class_Mammalia | Class_Mammalia | Class_Mammalia | 86 | 0 | 0.0000000 | 1 | 1.1627907 | 1.1627907 |
| Class_Insecta | Class_Insecta | Class_Insecta | Class_Insecta | 31 | 0 | 0.0000000 | 2 | 6.4516129 | 6.4516129 |
| Class_Aves | Class_Aves | Class_Aves | Class_Aves | 2 | 0 | 0.0000000 | 1 | 50.0000000 | 50.0000000 |
| Class_Aves | Class_Aves | Class_Aves | Class_Aves | 2 | 0 | 0.0000000 | 1 | 50.0000000 | 50.0000000 |
| Class_Aves | Class_Aves | Class_Aves | Class_Aves | 5 | 0 | 0.0000000 | 1 | 20.0000000 | 20.0000000 |
| Diptera | Culicidae | Family_Culicidae | Family_Culicidae | 4 | 0 | 0.0000000 | 4 | 100.0000000 | 100.0000000 |
| Diptera | Order_Diptera | Order_Diptera | Order_Diptera | 2 | 0 | 0.0000000 | 1 | 50.0000000 | 50.0000000 |
| Diptera | Tipulidae | Tipula | Genus_Tipula | 3 | 0 | 0.0000000 | 1 | 33.3333333 | 33.3333333 |
Remove taxa of which the blanks hold over 2% of the total reads for that MOTU
tab.nam <- blnk.df[,"perc"] > 2 # 2 is for 2%
tab.df <- blnk.df[tab.nam,]
removeTaxa <- rownames(tab.df) # Lists the MOTUs to remove
phy.obj <- subset_taxa(gillet.phylo, !(taxa_names(gillet.phylo) %in% removeTaxa))
# How many MOTUs left?
phy.obj
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2170 taxa and 50 samples ]
## sample_data() Sample Data: [ 50 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 2170 taxa by 10 taxonomic ranks ]
# Spare palette from https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d
# Palette is from study to create most contrasting colours
pal.o = c("#f0a3ff", # Araneae
"#0075dc", # Arch *
"#993f00", # Coleoptera
"#4c005c", # Diptera *
"#191919", # Ento *
"#005c31", # Geo *
"#2bce48", # Glom
"#ffcc99", # Hap *
"#808080", # Hemip
"#94ffb5", # Hymen
"#8f7c00", # Isopoda
"#9dcc00", # Julida
"#c20088", # Lepidop
"blue", # Lithobiomorpha *
"#ffa405", # Mesostig
"#ffa8bb", # Neuropter *
"#426600", # Oppiliones *
"#ff0010", # Orbitida *
"#5ef1f2",# Orthoptera *
"#00998f", # Poly *
"#e0ff66", # psocoptera
"indianred", # Sarc
"#003380", # stylo
"green", # trom *
"khaki4",
"darkred",
"coral4",
"violetred2",
"#0075dc",
"#993f00")
samples.phylo <- gillet.phylo
samples.phylo <- tax_glom(gillet.phylo, taxrank = "order")
n <- grep("Chiroptera", tax_table(samples.phylo)[,"order"])
m <- grep("Eulipotyphla", tax_table(samples.phylo)[,"order"])
tax_table(samples.phylo)[-c(n, m), 1:4] <- "Other"
# What are the 3 unique taxa orders?
unique(tax_table(samples.phylo)[,"order"])
## Taxonomy Table: [3 taxa by 1 taxonomic ranks]:
## order
## gMOTU_1 "Chiroptera"
## gMOTU_2 "Eulipotyphla"
## gMOTU_5 "Other"
# Agglomerate MOTUs to order level
mm.oth <- tax_glom(samples.phylo, taxrank = "order")
# Note that phyloseq has 3 taxa (Chiroptera, Eulipotyphla, Other)
mm.oth
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 3 taxa and 50 samples ]
## sample_data() Sample Data: [ 50 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 3 taxa by 10 taxonomic ranks ]
# transform read counts to relative abundance
mm.oth.ra = transform_sample_counts(mm.oth, function(x) 100 * x/sum(x))
ra.samples.bar <- phyloseq::plot_bar(mm.oth.ra) # extracts information needed for barplots
ra.samples.bar.data <- ra.samples.bar$data
p1.95 <- ggplot(ra.samples.bar.data, aes(x= Sample, y=Abundance, fill = order)) +
geom_bar(stat="identity", color="black") +
scale_fill_manual(values = pal.o) +
facet_wrap(~ mammal, scale = "free_x") +
theme_classic() +
theme(legend.position = "right") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p1.95
Composition of prey reads compared to host - clustered at 95%
Check range of vertebrate amplification in samples
samples.phylo <- subset_samples(phy.obj, mammal != "BLANK")
n <- grep("Chordata", tax_table(samples.phylo)[,"phylum"])
tax_table(samples.phylo)[-n, 1:4] <- "Other"
#unique(tax_table(samples.phylo)[,"phylum"])
mm.oth <- tax_glom(samples.phylo, taxrank = "phylum")
#mm.oth
mm.oth.ra = transform_sample_counts(mm.oth, function(x) 100 * x/sum(x))
# Check Taxa
taxa_names(mm.oth.ra) <- c("Vertebrate", "Other")
tax_table(mm.oth.ra)[,1:2]
## Taxonomy Table: [2 taxa by 2 taxonomic ranks]:
## kingdom phylum
## Vertebrate "Metazoa" "Chordata"
## Other "Other" "Other"
# Proportion of host reads in each sample
otu_table(mm.oth.ra)
## OTU Table: [2 taxa and 48 samples]
## taxa are rows
## G_1 G_10 G_11 G_12 G_13 G_14
## Vertebrate 89.06104 97.44169 98.685607 98.875154 97.660007 92.095764
## Other 10.93896 2.55831 1.314393 1.124846 2.339993 7.904236
## G_15 G_16 G_17 G_18 G_19 G_2
## Vertebrate 97.956883 86.62167 78.16822 97.878097 99.5345188 97.801715
## Other 2.043117 13.37833 21.83178 2.121903 0.4654812 2.198285
## G_20 G_21 G_22 G_23 G_24 G_3
## Vertebrate 97.866227 91.206891 89.45764 84.74596 64.51326 99.6463573
## Other 2.133773 8.793109 10.54236 15.25404 35.48674 0.3536427
## G_4 G_5 G_6 G_7 G_8 G_9
## Vertebrate 99.4643312 90.73662 94.264268 94.297427 95.609202 95.563988
## Other 0.5356688 9.26338 5.735732 5.702573 4.390798 4.436012
## G_E26 G_E27 G_E29 G_E31 G_E32 G_E33
## Vertebrate 96.158934 99.6653127 11.93543 92.15071 26.50477 90.026674
## Other 3.841066 0.3346873 88.06457 7.84929 73.49523 9.973326
## G_E47 G_FR14 G_FR15 G_FR16 G_FR17 G_FR20
## Vertebrate 99.0556457 4.38972 3.199108 5.803229 90.425228 91.468491
## Other 0.9443543 95.61028 96.800892 94.196771 9.574772 8.531509
## G_FR21 G_S18 G_S19 G_S21 G_S33 G_S34
## Vertebrate 89.20737 95.417696 22.61281 40.02194 98.843105 99.8711241
## Other 10.79263 4.582304 77.38719 59.97806 1.156895 0.1288759
## G_S48 G_S50 G_S51 G_W12 G_W13 G_W14
## Vertebrate 95.932179 74.20471 0.8261888 95.950804 79.85832 86.18168
## Other 4.067821 25.79529 99.1738112 4.049196 20.14168 13.81832
# Range of Vertebrate amplification across all samples (%)
range(otu_table(mm.oth.ra)[1,])
## [1] 0.8261888 99.8711241
# Range of Vertebrate amplification across GWTS (%)
gwts.prop <- subset_samples(mm.oth.ra, mammal == "GWTS")
range(otu_table(gwts.prop)[1,])
## [1] 89.20737 99.87112
# Range of Vertebrate amplification across Pygmies (%)
pyg.prop <- subset_samples(mm.oth.ra, mammal == "Pygmy")
range(otu_table(pyg.prop)[1,])
## [1] 0.8261888 95.9508037
# Range of Vertebrate amplification across Bats (%)
bat.prop <- subset_samples(mm.oth.ra, mammal == "Bat")
range(otu_table(bat.prop)[1,])
## [1] 64.51326 99.64636
# Remove negative controls
samples.phylo <- subset_samples(phy.obj, mammal != "BLANK")
# Remove non-prey taxa
diet.prey <- subset_taxa(samples.phylo, !(class %in% c("Mammalia",
"none",
"Actinopteri",
"Bdelloidea",
"Udeonychophora", # velvet worms
"Merostomata", # horse shoe crabs
"Gammaproteobacteria", # bacteria
"Magnoliopsida", # plants
"Monogononta", # rotifers
"Dothideomycetes", # fungi
"Trebouxiophyceae", # green algae
"Chondrichthyes", # Cartilaginous fish
"Mucoromycetes", # fungi
"Phylum_Endomyxa", # micro things
"Eutardigrada", # tartigrades!!
"Elardia", # Amoebas
"Cephalopoda", # Cephalopods
"Amphibia", # Amphibians
"Aves", # Birds
"Chromadorea", # roundworms
"Hexanauplia", # parasitic crustaceans
"Kingdom_Metazoa",
"Kingdom_",
"Phylum_Discosea", # amoebas
"Branchiopoda", # marine crustaceans
"Phylum_Nematoda")))
# remove samples with less than 1000 reads
sampl.filt <- prune_samples(sample_sums(diet.prey) > 1000, diet.prey)
# Extract OTU table
otu.tab <- as.data.frame(otu_table(sampl.filt))
# Remove MOTUs from samples that have < 0.01% total reads of that sample
new.otu.tab <- hilldiv::copy_filt(otu.tab, 0.0001)
# Assign cleaned matrix to phyloseq object
new.otu.tab <- as.matrix(new.otu.tab)
otu_table(sampl.filt) <- otu_table(new.otu.tab, taxa_are_rows = TRUE)
# Check amount of remaining MOTUs
sampl.filt
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1075 taxa and 44 samples ]
## sample_data() Sample Data: [ 44 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 1075 taxa by 10 taxonomic ranks ]
# Remove MOTUs that are represented by < 5 reads in total
final.diet <- prune_taxa(taxa_sums(sampl.filt) > 4, sampl.filt)
# The final dataset is...
final.diet
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 540 taxa and 44 samples ]
## sample_data() Sample Data: [ 44 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 540 taxa by 10 taxonomic ranks ]
order.glom <- tax_glom(final.diet, taxrank= "order")
ntx <- as.numeric(ntaxa(order.glom))
top25ord <- prune_taxa(names(sort(taxa_sums(order.glom),TRUE)[1:19]), order.glom)
othr.ord <- prune_taxa(names(sort(taxa_sums(order.glom),TRUE)[20:ntx]), order.glom)
n <- taxa_names(othr.ord)
tax_table(order.glom)[n, 1:4] <- "Other"
n <- grep("Class_", tax_table(order.glom)[,"order"])
tax_table(order.glom)[n, 1:4] <- "Other"
top25ord <- tax_glom(order.glom, taxrank= "order")
ra.samples = transform_sample_counts(top25ord, function(x) 100 * x/sum(x))
ra.samples.bar <- phyloseq::plot_bar(ra.samples) # extracts information needed for barplots
ra.samples.bar.data <- ra.samples.bar$data
p2.95 <- ggplot(ra.samples.bar.data, aes(x= Sample, y=Abundance, fill = order)) +
geom_bar(stat="identity", color="black") +
scale_fill_manual(values = pal.o) +
facet_wrap(~ mammal, scale = "free_x") +
theme_classic() + # appearance composition
theme(strip.text.x = element_text(size = 15, face = "bold")) + # facet text
# scale_x_discrete(labels=c("Bat_Gillet" = "Bat\nN=22", "GWTS_Gillet" = "GWTS\nN=7", "Pygmy_Gillet" = "Pygmy\nN=15",
# "Bat_Zeale" = "Bat\nN=23", "GWTS_Zeale" = "GWTS\nN=4", "Pygmy_Zeale" = "Pygmy\nN=11")) +
theme(legend.position = "right",
legend.title = element_blank(),
legend.text = element_text(size = 10),
legend.key.size = unit(0.8, "cm")) +
ggtitle("Composition of diet - clustered at 95%") +
labs(y = "Relative Abundance") +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 20,
face = "bold")) +
theme(axis.text.x = element_text(size = 12 ,
face = "bold",
angle = 45,
hjust = 1),
axis.text.y = element_text(size = 12,
face = "bold"))
p2.95
Composition of prey reads compared to host - clustered at 95%
# Range of sample depth of samples
range(sample_sums(final.diet))
## [1] 1120 318945
# Range of read depth of taxa
range(taxa_sums(final.diet))
## [1] 5 172198
# Average read depth per sample
mean(sample_sums(final.diet))
## [1] 37982.2
# hill_div packages assessment taxa coverage per sample, according to a shannon diversity equivilent
depth_cov(new.otu.tab,
qvalue = 1)
## Depth Observed Estimated Coverage
## G_1 35501 9.93 9.94 99.89
## G_10 5049 17.14 17.24 99.42
## G_11 2413 14.42 14.56 99.00
## G_12 1384 13.94 14.21 98.14
## G_13 1724 8.09 8.18 98.93
## G_14 7499 9.06 9.10 99.57
## G_15 3594 14.39 14.61 98.52
## G_16 15633 21.64 21.70 99.73
## G_17 23338 1.62 1.62 99.96
## G_18 1496 3.23 3.25 99.54
## G_2 7363 4.96 4.97 99.64
## G_20 2049 10.10 10.17 99.31
## G_21 14261 12.25 12.28 99.76
## G_22 7766 7.89 7.91 99.68
## G_23 25038 5.54 5.55 99.91
## G_24 35371 4.17 4.18 99.93
## G_4 1219 2.76 2.78 99.35
## G_5 18790 1.89 1.89 99.95
## G_6 11034 3.92 3.93 99.88
## G_7 10144 1.52 1.52 99.85
## G_8 9687 12.37 12.42 99.63
## G_9 7021 14.87 14.95 99.50
## G_E26 8841 4.31 4.33 99.70
## G_E29 98018 7.17 7.18 99.99
## G_E31 3088 2.69 2.71 99.43
## G_E32 65729 4.01 4.02 99.99
## G_E33 20263 4.32 4.32 99.95
## G_E47 2496 1.12 1.12 99.62
## G_FR14 318945 4.19 4.19 100.00
## G_FR15 279150 3.23 3.23 99.99
## G_FR16 181397 4.94 4.94 99.99
## G_FR17 1713 2.86 2.87 99.85
## G_FR20 19879 1.47 1.47 99.93
## G_FR21 42139 7.80 7.81 99.97
## G_S18 4378 4.33 4.33 99.82
## G_S19 72526 4.64 4.64 99.98
## G_S21 34698 3.63 3.63 99.98
## G_S33 2475 4.88 4.91 99.47
## G_S48 10581 1.54 1.54 99.95
## G_S50 13493 11.25 11.26 99.93
## G_S51 225301 2.50 2.50 100.00
## G_W12 1123 3.82 3.86 99.15
## G_W13 6602 3.42 3.43 99.70
## G_W14 11386 6.75 6.75 99.96
# Seperate phyloseq object per mammal species
Bat_G <- prune_samples(sample_data(final.diet)$mammal == "Bat", final.diet)
df1 <- as.data.frame(t(as.matrix(otu_table(Bat_G))))
gwts_G <- prune_samples(sample_data(final.diet)$mammal == "GWTS", final.diet)
df2 <- as.data.frame(t(as.matrix(otu_table(gwts_G))))
pyg_G <- prune_samples(sample_data(final.diet)$mammal == "Pygmy", final.diet)
df3 <- as.data.frame(t(as.matrix(otu_table(pyg_G))))
# Grab random subset of 10 individuals from each mammal species to test
set.seed(57)
# Bat sample rarefaction
x <- sample(nrow(df1), 10)
r1 <- rarecurve(df1[x,])
# GWTS sample rarefaction
#y <- sample(nrow(df2), 10)
r2 <- rarecurve(df2[,])
# Pygmy sample rarefaction
y <- sample(nrow(df3), 10)
r3 <- rarecurve(df3[y,])
The following is an alternative way to show the rarefaction results
col <- c("black", "darkred", "forestgreen", "hotpink", "blue")
lty <- c("solid", "dashed", "dotdash")
lwd <- c(1, 2)
pars <- expand.grid(col = col, lty = lty, lwd = lwd,
stringsAsFactors = FALSE)
head(pars)
out <- r1
raremax <- 1000
Nmax <- sapply(out, function(x) max(attr(x, "Subsample")))
Smax <- sapply(out, max)
plot(c(1, max(Nmax)), c(1, max(Smax)), main = "Gillet - Bat - Rarefaction",
xlab = "Sample Size",
ylab = "Species", type = "n", ylim = c(0,150))
#abline(v = raremax)
for (i in seq_along(out)) {
N <- attr(out[[i]], "Subsample")
with(pars, lines(N, out[[i]], col = col[i], lty = lty[i], lwd = lwd[i]))
}
out <- r2
raremax <- 1000
Nmax <- sapply(out, function(x) max(attr(x, "Subsample")))
Smax <- sapply(out, max)
plot(c(1, max(Nmax)), c(1, max(Smax)), main = "Gillet - GWTS - Rarefaction",
xlab = "Sample Size",
ylab = "Species", type = "n", ylim = c(0,150))
#abline(v = raremax)
for (i in seq_along(out)) {
N <- attr(out[[i]], "Subsample")
with(pars, lines(N, out[[i]], col = col[i], lty = lty[i], lwd = lwd[i]))
}
out <- r3
raremax <- 1000
Nmax <- sapply(out, function(x) max(attr(x, "Subsample")))
Smax <- sapply(out, max)
plot(c(1, max(Nmax)), c(1, max(Smax)), main = "Gillet - Pygmy - Rarefaction",
xlab = "Sample Size",
ylab = "Species", type = "n", ylim = c(0,150))
#abline(v = raremax)
for (i in seq_along(out)) {
N <- attr(out[[i]], "Subsample")
with(pars, lines(N, out[[i]], col = col[i], lty = lty[i], lwd = lwd[i]))
}
The following is a chunky code to count the MOTUs that were correctly assigned to taxonomy at various levels, accoring to this studies criteria set and specified in the main text.
It doe not count when the taxonomy contains “_sp." etc in the name
final.diet.g <- final.diet
# Extract taxonomy table
df1 <- as.data.frame(as.matrix(tax_table(final.diet.g)))
df <- df1
df$pident <- as.character(df$pident)
df$pident <- as.numeric(df$pident)
###################
## Species
n <- which(df[,"pident"] > 97.9999)
gn <- grep("Genus_", df[n,"species"])
fm <- grep("Family_", df[n,"species"])
or <- grep("Order_", df[n,"species"])
cl <- grep("Class_", df[n,"species"])
ph <- grep("Phylum_", df[n,"species"])
kg <- grep("Kindgom_", df[n,"species"])
nn <- grep("none", df[n,"species"])
s.p <- grep("_sp._", df[n,"species"])
n.r <- grep("_nr._", df[n,"species"])
c.f <- grep("_cf._", df[n,"species"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f))
sp.y <- length(df[n,"species"]) - x
chck <- df[n[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f)],"species"]
un.sp.y <- length(unique(chck))
###################
## Genus
df <- df[-n,]
n <- which(df[,"pident"] > 94.9999)
gn <- grep("Genus_", df[n,"genus"])
fm <- grep("Family_", df[n,"genus"])
or <- grep("Order_", df[n,"genus"])
cl <- grep("Class_", df[n,"genus"])
ph <- grep("Phylum_", df[n,"genus"])
kg <- grep("Kindgom_", df[n,"genus"])
nn <- grep("none", df[n,"genus"])
s.p <- grep("_sp._", df[n,"genus"])
n.r <- grep("_nr._", df[n,"genus"])
c.f <- grep("_cf._", df[n,"genus"])
g.g <- grep("_gen._", df[n,"genus"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f,g.g))
gn.y <- length(df[n,"genus"]) - x
gn <- grep("Genus_", df1[,"genus"])
fm <- grep("Family_", df1[,"genus"])
or <- grep("Order_", df1[,"genus"])
cl <- grep("Class_", df1[,"genus"])
ph <- grep("Phylum_", df1[,"genus"])
kg <- grep("Kindgom_", df1[,"genus"])
nn <- grep("none", df1[,"genus"])
s.p <- grep("_sp._", df1[,"genus"])
n.r <- grep("_nr._", df1[,"genus"])
c.f <- grep("_cf._", df1[,"genus"])
g.g <- grep("_gen._", df1[,"genus"])
chck <- df1[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f),"genus"]
un.gn.y <- length(unique(chck))
###################
## Family
df <- df[-n,]
n <- which(df[,"pident"] > 92.999)
gn <- grep("Genus_", df[n,"family"])
fm <- grep("Family_", df[n,"family"])
or <- grep("Order_", df[n,"family"])
cl <- grep("Class_", df[n,"family"])
ph <- grep("Phylum_", df[n,"family"])
kg <- grep("Kindgom_", df[n,"family"])
nn <- grep("none", df[n,"family"])
s.p <- grep("_sp._", df[n,"family"])
n.r <- grep("_nr._", df[n,"family"])
c.f <- grep("_cf._", df[n,"family"])
g.g <- grep("_gen._", df[n,"family"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f,g.g))
fm.y <- length(df[n,"family"]) - x
gn <- grep("Genus_", df1[,"family"])
fm <- grep("Family_", df1[,"family"])
or <- grep("Order_", df1[,"family"])
cl <- grep("Class_", df1[,"family"])
ph <- grep("Phylum_", df1[,"family"])
kg <- grep("Kindgom_", df1[,"family"])
nn <- grep("none", df1[,"family"])
s.p <- grep("_sp._", df1[,"family"])
n.r <- grep("_nr._", df1[,"family"])
c.f <- grep("_cf._", df1[,"family"])
g.g <- grep("_gen._", df1[,"family"])
chck <- df1[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f),"family"]
un.fm.y <- length(unique(chck))
###################
## Order
df <- df[-n,]
n <- which(df[,"pident"] > 89.9999)
gn <- grep("Genus_", df[n,"order"])
fm <- grep("Family_", df[n,"order"])
or <- grep("Order_", df[n,"order"])
cl <- grep("Class_", df[n,"order"])
ph <- grep("Phylum_", df[n,"order"])
kg <- grep("Kindgom_", df[n,"order"])
nn <- grep("none", df[n,"order"])
s.p <- grep("_sp._", df[n,"order"])
n.r <- grep("_nr._", df[n,"order"])
c.f <- grep("_cf._", df[n,"order"])
g.g <- grep("_gen._", df[n,"order"])
x <- length(c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f,g.g))
or.y <- length(df[n,"order"]) - x
gn <- grep("Genus_", df1[,"order"])
fm <- grep("Family_", df1[,"order"])
or <- grep("Order_", df1[,"order"])
cl <- grep("Class_", df1[,"order"])
ph <- grep("Phylum_", df1[,"order"])
kg <- grep("Kindgom_", df1[,"order"])
nn <- grep("none", df1[,"order"])
s.p <- grep("_sp._", df1[,"order"])
n.r <- grep("_nr._", df1[,"order"])
c.f <- grep("_cf._", df1[,"order"])
g.g <- grep("_gen._", df1[,"order"])
chck <- df1[-c(gn,fm,or,cl,ph,kg,nn,s.p,n.r,c.f),"order"]
un.or.y <- length(unique(chck))
df <- df1
#un.gn.y <- length(unique(df1[,"genus"]))
#un.fm.y <- length(unique(df1[,"family"]))
#un.or.y <- length(unique(df1[,"order"]))
# Create a table to save the results
tabx <- data.frame(primer = NA,
order = NA, unique.orders = NA,
family = NA, unique.families = NA,
genus = NA, unique.genus = NA,
species = NA, unique.species = NA,
total.taxa = NA,
total.reads = NA)
# Save results into the table/dataframe
tabx[1,] <- c("Gillet95", or.y, un.or.y, fm.y, un.fm.y,
gn.y, un.gn.y, sp.y, un.sp.y,
ntaxa(final.diet.g), sum(sample_sums(final.diet.g)))
Now that results have been saved in the dataframe, we repeat with each dataset and save the results in “tabx” dataframe
set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r4 <- rarecurve(df1[x,])
# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r5 <- rarecurve(df2[,])
# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r6 <- rarecurve(df3[y,])
set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r7 <- rarecurve(df1[x,])
# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r8 <- rarecurve(df2[,])
# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r9 <- rarecurve(df3[y,])
set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r10 <- rarecurve(df1[x,])
# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r11 <- rarecurve(df2[,])
# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r12 <- rarecurve(df3[y,])
gridExtra::grid.arrange(p2.95,
p2.96,
p2.97,
p2.98,
nrow = 2)
Comparing final diet composition when sequences are clustered at different thresholds: 95% - 98%
kable(tabx, caption = "Number of MOTUs assigned to taxonomy of different levels") %>%
kable_styling(bootstrap_options = "striped",
full_width = F,
position = "left")
| primer | order | unique.orders | family | unique.families | genus | unique.genus | species | unique.species | total.taxa | total.reads |
|---|---|---|---|---|---|---|---|---|---|---|
| Gillet95 | 72 | 24 | 82 | 120 | 72 | 207 | 217 | 211 | 540 | 1671217 |
| Gillet96 | 71 | 25 | 79 | 123 | 116 | 216 | 233 | 220 | 606 | 1668404 |
| Gillet97 | 82 | 25 | 85 | 123 | 180 | 218 | 271 | 228 | 736 | 1663063 |
| Gillet98 | 95 | 25 | 107 | 125 | 228 | 223 | 371 | 232 | 945 | 1652436 |
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2222 taxa and 51 samples ]
## sample_data() Sample Data: [ 51 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 2222 taxa by 10 taxonomic ranks ]
| order | family | genus | species | total.reads | Neg1.reads | Neg1.prop | Neg2.reads | Neg2.prop | perc |
|---|---|---|---|---|---|---|---|---|---|
| Diptera | Tipulidae | Tipula | Genus_Tipula | 111029 | 170 | 0.1531132 | 0 | 0.0000000 | 0.1531132 |
| Diptera | Tipulidae | Tipula | Genus_Tipula | 10 | 10 | 100.0000000 | 0 | 0.0000000 | 100.0000000 |
| Diptera | Tipulidae | Family_Tipulidae | Family_Tipulidae | 11842 | 0 | 0.0000000 | 1 | 0.0084445 | 0.0084445 |
Composition of prey reads compared to host - clustered at 95%
# remove samples with less than 1000 reads
sampl.filt <- prune_samples(sample_sums(diet.prey) > 1000, diet.prey)
# Extract OTU table
otu.tab <- as.data.frame(otu_table(sampl.filt))
# Remove MOTUs from samples that have < 0.01% total reads of that sample
new.otu.tab <- hilldiv::copy_filt(otu.tab, 0.0001)
# Assign cleaned matrix to phyloseq object
new.otu.tab <- as.matrix(new.otu.tab)
otu_table(sampl.filt) <- otu_table(new.otu.tab, taxa_are_rows = TRUE)
# Check amount of remaining MOTUs
sampl.filt
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2115 taxa and 38 samples ]
## sample_data() Sample Data: [ 38 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 2115 taxa by 10 taxonomic ranks ]
# Remove MOTUs that are represented by < 5 reads in total
final.diet <- prune_taxa(taxa_sums(sampl.filt) > 4, sampl.filt)
# The final dataset is...
final.diet
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 417 taxa and 38 samples ]
## sample_data() Sample Data: [ 38 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 417 taxa by 10 taxonomic ranks ]
# Range of sample depth of samples
range(sample_sums(final.diet))
## [1] 2953 620979
# Range of read depth of taxa
range(taxa_sums(final.diet))
## [1] 5 526148
# Average read depth per sample
mean(sample_sums(final.diet))
## [1] 160684.3
# hill_div packages assessment taxa coverage per sample, according to a shannon diversity equivilent
depth_cov(new.otu.tab,
qvalue = 1)
## Depth Observed Estimated Coverage
## Z_10 206188 6.06 6.06 100.00
## Z_11 252639 4.49 4.49 99.99
## Z_12 200456 3.49 3.50 99.99
## Z_13 241716 3.04 3.04 99.99
## Z_14 85475 6.01 6.01 99.99
## Z_15 246599 7.41 7.41 99.99
## Z_16 2978 7.43 7.52 98.76
## Z_17 235886 5.10 5.10 100.00
## Z_18 4607 2.14 2.14 99.80
## Z_19 345250 4.22 4.22 100.00
## Z_2 11971 4.04 4.04 99.95
## Z_20 7162 3.53 3.54 99.54
## Z_21 213317 9.97 9.98 99.99
## Z_22 264815 1.53 1.53 100.00
## Z_23 29968 7.92 7.92 99.95
## Z_24 310608 5.62 5.62 99.99
## Z_3 21562 1.94 1.94 99.98
## Z_4 13201 4.85 4.85 99.97
## Z_5 204439 4.19 4.19 99.99
## Z_6 140527 6.79 6.79 99.99
## Z_7 212066 6.73 6.73 99.99
## Z_8 4466 7.92 8.00 98.96
## Z_9 96952 1.80 1.80 100.00
## Z_E26 83067 2.19 2.19 100.00
## Z_E29 225498 1.22 1.22 100.00
## Z_E31 16701 1.10 1.10 99.97
## Z_E32 278483 2.88 2.88 100.00
## Z_E33 57603 1.11 1.11 99.98
## Z_FR14 203171 2.21 2.21 100.00
## Z_FR15 239751 3.49 3.49 99.99
## Z_FR16 620979 1.93 1.93 100.00
## Z_FR20 10716 2.11 2.11 99.92
## Z_FR21 166760 1.98 1.98 100.00
## Z_S19 35303 2.29 2.29 99.99
## Z_S21 433836 1.31 1.31 100.00
## Z_S50 97089 1.75 1.75 100.00
## Z_S51 205273 3.32 3.32 100.00
## Z_W13 79030 1.03 1.03 100.00
# Grab random subset of 10 individuals from each mammal species to test
set.seed(57)
# Bat sample rarefaction
x <- sample(nrow(df1), 10)
r1 <- rarecurve(df1[x,])
# GWTS sample rarefaction
#y <- sample(nrow(df2), 10)
r2 <- rarecurve(df2[,])
# Pygmy sample rarefaction
y <- sample(nrow(df3), 10)
r3 <- rarecurve(df3[y,])
set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r4 <- rarecurve(df1[x,])
# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r5 <- rarecurve(df2[,])
# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r6 <- rarecurve(df3[y,])
set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r7 <- rarecurve(df1[x,])
# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r8 <- rarecurve(df2[,])
# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r9 <- rarecurve(df3[y,])
set.seed(57)
# Bat Rarefaction Curve
x <- sample(nrow(df1), 10)
r10 <- rarecurve(df1[x,])
# GWTS Rarefaction Curve
#y <- sample(nrow(df2), 10)
r11 <- rarecurve(df2[,])
# Pygmy Rarefaction Curve
y <- sample(nrow(df3), 10)
r12 <- rarecurve(df3[y,])
gridExtra::grid.arrange(p2.95z,
p2.96z,
p2.97z,
p2.98z,
nrow = 2)
Comparing final diet composition when sequences are clustered at different thresholds: 95% - 98%
kable(tabx, caption = "Number of MOTUs assigned to taxonomy of different levels") %>%
kable_styling(bootstrap_options = "striped",
full_width = F,
position = "left")
| primer | order | unique.orders | family | unique.families | genus | unique.genus | species | unique.species | total.taxa | total.reads |
|---|---|---|---|---|---|---|---|---|---|---|
| Gillet95 | 72 | 24 | 82 | 120 | 72 | 207 | 217 | 211 | 540 | 1671217 |
| Gillet96 | 71 | 25 | 79 | 123 | 116 | 216 | 233 | 220 | 606 | 1668404 |
| Gillet97 | 82 | 25 | 85 | 123 | 180 | 218 | 271 | 228 | 736 | 1663063 |
| Gillet98 | 95 | 25 | 107 | 125 | 228 | 223 | 371 | 232 | 945 | 1652436 |
| Zeale95 | 36 | 14 | 95 | 85 | 113 | 162 | 130 | 127 | 417 | 6106005 |
| Zeale96 | 39 | 14 | 90 | 82 | 173 | 172 | 142 | 133 | 499 | 6104836 |
| Zeale97 | 44 | 13 | 125 | 85 | 279 | 189 | 217 | 146 | 750 | 6087051 |
| Zeale98 | 50 | 13 | 125 | 84 | 325 | 195 | 297 | 158 | 927 | 6058860 |
tabx$primer <- c(rep("Gillet", 4), rep("Zeale", 4))
df <- melt(data = tabx, id.vars = "primer",
measure.vars = c("order",
"family",
"genus",
"species"))
df$clust <- rep(c("95%", "96%", "97%", "98%"), 8)
df$variable <- paste(df$variable, df$clust, sep = " ")
df$value <- as.numeric(df$value)
df$primer <- factor(df$primer, levels = c("Zeale", "Gillet"))
df$variable <- factor(df$variable, levels = c("order 95%",
"order 96%",
"order 97%",
"order 98%",
"family 95%",
"family 96%",
"family 97%",
"family 98%",
"genus 95%",
"genus 96%",
"genus 97%",
"genus 98%",
"species 95%",
"species 96%",
"species 97%",
"species 98%"))
p <- ggplot(df, aes(x = variable, y=value)) +
geom_bar(aes(fill = primer),
stat = "identity",
color = "black",
position = position_dodge()) +
theme_classic() +
scale_fill_manual(values = c("darkblue", "pink")) +
scale_x_discrete(labels=c("order 95%" = "Order - 95%",
"order 96%" = "Order - 96%",
"order 97%" = "Order - 97%",
"order 98%" = "Order - 98%",
"family 95%" = "Family - 95%",
"family 96%" = "Family - 96%",
"family 97%" = "Family - 97%",
"family 98%" = "Family - 98%",
"genus 95%" = "Genus - 95%",
"genus 96%" = "Genus - 96%",
"genus 97%" = "Genus - 97%",
"genus 98%" = "Genus - 98%",
"species 95%" = "Species - 95%",
"species 96%" = "Species - 96%",
"species 97%" = "Species - 97%",
"species 98%" = "Species - 98%")) +
theme(axis.text.x = element_text(size = 12, angle = 45, hjust = 1,
face = "bold"),
axis.text.y = element_text(size = 12, face = "bold")) +
labs(y = "Number Identified") +
theme(legend.position = c(0.15,0.8)) +
theme(legend.title = element_blank(),
legend.text = element_text(size = 20)) +
#theme(legend.position = "bottom") +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 15, face = "bold"))
p
tabprop <- tabx
tabprop[2:11] <- sapply(tabprop[2:11],as.numeric)
for (i in 1:8){
tabprop[i,"order"] <- (tabprop[i,"order"]/tabprop[i, "total.taxa"]) * 100
tabprop[i,"family"] <- (tabprop[i,"family"]/tabprop[i, "total.taxa"]) * 100
tabprop[i,"genus"] <- (tabprop[i,"genus"]/tabprop[i, "total.taxa"]) * 100
tabprop[i,"species"] <- (tabprop[i,"species"]/tabprop[i, "total.taxa"]) * 100
}
df <- melt(data = tabprop, id.vars = "primer",
measure.vars = c("order",
"family",
"genus",
"species"))
df$clust <- rep(c("95%", "96%", "97%", "98%"), 8)
df$variable <- paste(df$variable, df$clust, sep = " ")
df$value <- as.numeric(df$value)
df$primer <- factor(df$primer, levels = c("Zeale", "Gillet"))
df$variable <- factor(df$variable, levels = c("order 95%",
"order 96%",
"order 97%",
"order 98%",
"family 95%",
"family 96%",
"family 97%",
"family 98%",
"genus 95%",
"genus 96%",
"genus 97%",
"genus 98%",
"species 95%",
"species 96%",
"species 97%",
"species 98%"))
p2 <- ggplot(df, aes(x = variable, y=value)) +
geom_bar(aes(fill = primer),
stat = "identity",
color = "black",
position = position_dodge()) +
theme_classic() +
scale_fill_manual(values = c("darkblue", "pink")) +
scale_x_discrete(labels=c("order 95%" = "Order - 95%",
"order 96%" = "Order - 96%",
"order 97%" = "Order - 97%",
"order 98%" = "Order - 98%",
"family 95%" = "Family - 95%",
"family 96%" = "Family - 96%",
"family 97%" = "Family - 97%",
"family 98%" = "Family - 98%",
"genus 95%" = "Genus - 95%",
"genus 96%" = "Genus - 96%",
"genus 97%" = "Genus - 97%",
"genus 98%" = "Genus - 98%",
"species 95%" = "Species - 95%",
"species 96%" = "Species - 96%",
"species 97%" = "Species - 97%",
"species 98%" = "Species - 98%")) +
theme(axis.text.x = element_text(size = 12, angle = 45, hjust = 1,
face = "bold"),
axis.text.y = element_text(size = 12, face = "bold")) +
labs(y = "Proportion (%) Identified") +
ggtitle("Proportion of MOTUs identified") +
theme(legend.position = c(0.15,0.8)) +
theme(legend.title = element_blank(),
legend.text = element_text(size = 20)) +
#theme(legend.position = "bottom") +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 15, face = "bold"))
p2